#### Scraping datos del censo ####

# CUADRO A1. POBLACIÓN TOTAL POR SEXO, SEGÚN MUNICIPIO Y PARROQUIA

library(pdftools)
library(tidyverse)

# Directory that holds the census PDFs to be scraped
pdf_folder <- "data/Censo_Ven_2011/censo_estados/"

# Full paths of every entry in that directory whose name ends in ".pdf"
pdf_files <- list.files(
  path = pdf_folder,
  pattern = "\\.pdf$",
  full.names = TRUE
)

# Extract the parish rows of table "CUADRO A1. POBLACIÓN TOTAL POR SEXO"
# from a single census PDF.
#
# Arguments:
#   pdf_path  Path to one PDF file.
#
# Returns:
#   A data frame with three character columns:
#     Parroquia  text before the first run of two or more spaces on the line
#     Población  the remainder of that line, unparsed (presumably the
#                population figures -- confirm against the PDFs)
#     Estado     the PDF file name without its ".pdf" extension
#   or NULL (after emitting a message) when the table, or its start/end
#   markers, cannot be located in the PDF.
extract_population_data <- function(pdf_path) {
  # pdf_text() yields one character string per page
  pdf_text_data <- pdf_text(pdf_path)

  # Pages whose text carries the table title
  page_indices <- which(
    str_detect(pdf_text_data, "CUADRO A1\\. POBLACIÓN TOTAL POR SEXO")
  )

  if (length(page_indices) == 0) {
    message(paste("Skipping:", pdf_path, "- Table not found."))
    return(NULL)
  }

  # Flatten the matching pages into one vector of trimmed lines
  lines <- pdf_text_data[page_indices] %>%
    str_split("\n") %>%
    unlist() %>%
    trimws()

  # Candidate boundaries of the parish listing. The raw match vectors are
  # kept so emptiness can be tested directly: min()/max() on an empty
  # vector return Inf/-Inf with a warning rather than a zero-length value.
  start_matches <- grep("^PARROQUIA|NO TIENE PARROQUIA", lines)
  end_matches <- grep("NOTA:|FUENTE:|Total Población|Total General", lines)

  if (length(start_matches) == 0 || length(end_matches) == 0 ||
      min(start_matches) > max(end_matches)) {
    message(paste("Skipping:", pdf_path, "- Could not find complete table."))
    return(NULL)
  }

  # First parish line through the last footer/total line
  start_index <- min(start_matches)
  end_index <- max(end_matches)
  table_data <- lines[start_index:end_index]

  # Drop summary and footnote rows
  table_data <- table_data[!str_detect(table_data, "Total|NOTA:|FUENTE:")]

  # Split each line once, at the first run of two or more spaces
  table_df <- str_split_fixed(table_data, " {2,}", 2) %>% as.data.frame()
  colnames(table_df) <- c("Parroquia", "Población")

  # Escaped dot and end anchor so only a literal trailing ".pdf" is removed;
  # a bare ".pdf" pattern would match any character followed by "pdf".
  estado <- str_replace(basename(pdf_path), "\\.pdf$", "")

  # Keep only parish rows and tag them with the source file's stem
  table_df <- table_df %>%
    filter(str_starts(Parroquia, "PARROQUIA|NO TIENE")) %>%
    mutate(Estado = estado)

  table_df
}

# Run the extractor over every PDF and stack the per-file results.
# Files for which the extractor returns NULL contribute no rows.
population_data <- map_dfr(pdf_files, extract_population_data)

# Fail with a clear message when nothing was extracted; otherwise the
# recode step below stops with "object 'Estado' not found".
if (nrow(population_data) == 0) {
  stop(
    "No population rows were extracted from the PDFs in ", pdf_folder,
    call. = FALSE
  )
}

# Inspect the raw file-name stems before recoding
unique(population_data$Estado)

# File-name stem -> state name used in the output
estado_mapping <- c(
  "anzoategui" = "ANZOATEGUI",
  "apure" = "APURE",
  "barinas" = "BARINAS",
  "carabobo" = "CARABOBO",
  "cojedes" = "COJEDES",
  "deltaamacuro" = "DELTA AMACURO",
  "DISTRITO-CAPITAL" = "DISTRITO CAPITAL",
  "ESTADO-AMAZONAS" = "AMAZONAS",
  "ESTADO-ARAGUA-1" = "ARAGUA",
  "ESTADO-BOLIVAR" = "BOLIVAR",
  "ESTADO-ZULIA" = "ZULIA",
  "falcon" = "FALCON",
  "guarico" = "GUARICO",
  "lara" = "LARA",
  "merida" = "MERIDA",
  "miranda" = "MIRANDA",
  "monagas" = "MONAGAS",
  "nuevaesparta" = "NUEVA ESPARTA",
  "portuguesa" = "PORTUGUESA",
  "sucre" = "SUCRE",
  "tachira" = "TACHIRA",
  "trujillo" = "TRUJILLO",
  "vargas" = "LA GUAIRA",
  "yaracuy" = "YARACUY"
)

# Recode the state names and strip the descriptive prefixes from the parish
# names. The prefixes are removed in sequence: every row kept by
# extract_population_data() starts with "PARROQUIA" or "NO TIENE", so a
# "^URBANA " / "^NO URBANA " alternative inside a single pattern can never
# match -- "^" only matches at the start of the original string. Removing
# the leading "PARROQUIA " first lets the urban/non-urban qualifier be
# anchored against what follows it.
population_data <- population_data %>%
  mutate(
    Estado = recode(Estado, !!!estado_mapping),
    Parroquia = str_remove(Parroquia, "^(NO TIENE PARROQUIA|PARROQUIA) "),
    Parroquia = str_remove(Parroquia, "^(NO URBANA|URBANA) "),
    Parroquia = str_remove_all(Parroquia, "CAPITAL "),
    Parroquia = str_remove_all(Parroquia, "[()]")
  )

# Verify the changes: row count per recoded state
table(population_data$Estado)

write.csv(
  population_data,
  "data/Censo_Ven_2011/Censo_2011_pob_mun_cleaned.csv",
  row.names = FALSE
)
saveRDS(population_data, "data/Censo_Ven_2011/Censo_2011_pob_mun_cleaned.rds")


### PRUEBA PARA CCS ####
# Ad-hoc check of the extraction steps on a single PDF and a fixed page.

census_files <- list.files("data/Censo_Ven_2011/")
census_files

# Extract text from the PDF (one string per page)
# NOTE(review): this section is titled "CCS" and labels its rows
# "DTTO. CAPITAL" below, yet it reads nuevaesparta.pdf -- confirm which
# file is intended.
ccs <- pdf_text("data/Censo_Ven_2011/censo_estados/nuevaesparta.pdf")

# Split the hard-coded page 46 into trimmed lines
ccs_pob_sex <- strsplit(ccs[46], "\n")[[1]] %>% trimws()

# Locate the lines starting with "PARROQUIA" and the statistical note
start_index <- grep("^PARROQUIA", ccs_pob_sex)
end_index <- grep(
  "NOTA: DIVISIÓN POLÍTICO TERRITORIAL OPERATIVA PARA FINES ESTADÍSTICOS",
  ccs_pob_sex
)

# grep() returns every matching line, and several lines start with
# "PARROQUIA". The first match of each is used explicitly, because `:` with
# a multi-element operand silently takes the first element and warns.
# The range must also run forwards, otherwise `:` would count down.
if (length(start_index) > 0 && length(end_index) > 0 &&
    start_index[1] <= end_index[1]) {
  ccs_pob_sex <- ccs_pob_sex[start_index[1]:end_index[1]]
} else {
  stop("Error: Could not find expected boundaries in the text extraction.")
}

# Split each line once, at the first run of two or more spaces
ccs_pob_sex <- str_split_fixed(ccs_pob_sex, " {2,}", 2) %>% as.data.frame()

# Rename columns
colnames(ccs_pob_sex) <- c("Parroquia", "Población")

# Keep only parish rows and add the state label
ccs_pob_sex <- ccs_pob_sex %>%
  filter(str_starts(Parroquia, "PARROQUIA")) %>%
  mutate(Estado = "DTTO. CAPITAL")
